library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data.set <- read.table("dataLA.prn", header=TRUE, na.strings="*")
p.10 <- ggplot(data=data.set, aes(keystroke10, colour=name)) +
geom_histogram(fill="white")
p.10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.20 <- ggplot(data=data.set, aes(keystroke20, colour=name)) +
geom_histogram(fill="white")
p.20
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.30 <- ggplot(data=data.set, aes(keystroke30, colour=name)) +
geom_histogram(fill="white")
p.30
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.40 <- ggplot(data=data.set, aes(keystroke40, colour=name)) +
geom_histogram(fill="white")
p.40
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.50 <- ggplot(data=data.set, aes(keystroke50, colour=name)) +
geom_histogram(fill="white")
p.50
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.60 <- ggplot(data=data.set, aes(keystroke60, colour=name)) +
geom_histogram(fill="white")
p.60
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.70 <- ggplot(data=data.set, aes(keystroke70, colour=name)) +
geom_histogram(fill="white")
p.70
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.80 <- ggplot(data=data.set, aes(keystroke80, colour=name)) +
geom_histogram(fill="white")
p.80
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.90 <- ggplot(data=data.set, aes(keystroke90, colour=name)) +
geom_histogram(fill="white")
p.90
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.100 <- ggplot(data=data.set, aes(keystroke100, colour=name)) +
geom_histogram(fill="white")
p.100
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
require(corrplot)
## Loading required package: corrplot
## corrplot 0.84 loaded
library(corrplot)
data.set$essay_typebin <- ifelse(data.set$essay_type == "True", 1, 0)
cor_data <- data.set[, 3:21]
corr_matrix <- cor(cor_data)
#simple correlation matrix
round(corr_matrix, 2)
## keystroke10 keystroke20 keystroke30 keystroke40 keystroke50
## keystroke10 1.00 0.28 0.23 0.29 0.06
## keystroke20 0.28 1.00 0.47 0.37 0.40
## keystroke30 0.23 0.47 1.00 0.34 0.45
## keystroke40 0.29 0.37 0.34 1.00 0.48
## keystroke50 0.06 0.40 0.45 0.48 1.00
## keystroke60 0.03 0.43 0.37 0.30 0.58
## keystroke70 0.15 0.03 0.18 0.35 0.10
## keystroke80 -0.01 0.20 0.15 0.12 0.34
## keystroke90 0.08 0.13 0.06 0.12 0.17
## keystroke100 -0.05 0.09 0.06 0.02 0.03
## keystroke110 -0.02 -0.13 0.02 -0.04 -0.08
## keystroke120 -0.01 0.01 -0.13 0.00 -0.08
## keystroke130 -0.07 -0.08 0.04 -0.12 -0.04
## keystroke140 -0.10 -0.11 -0.17 -0.08 -0.14
## keystroke150 0.04 -0.13 -0.16 -0.06 -0.15
## keystroke160 -0.03 -0.09 -0.10 -0.13 -0.08
## keystroke170 0.00 -0.14 -0.18 -0.11 -0.12
## ouliers 0.00 -0.18 -0.20 -0.16 -0.14
## essay_sent 0.01 -0.04 -0.01 -0.02 -0.03
## keystroke60 keystroke70 keystroke80 keystroke90 keystroke100
## keystroke10 0.03 0.15 -0.01 0.08 -0.05
## keystroke20 0.43 0.03 0.20 0.13 0.09
## keystroke30 0.37 0.18 0.15 0.06 0.06
## keystroke40 0.30 0.35 0.12 0.12 0.02
## keystroke50 0.58 0.10 0.34 0.17 0.03
## keystroke60 1.00 0.30 0.43 0.25 0.12
## keystroke70 0.30 1.00 0.21 0.22 0.13
## keystroke80 0.43 0.21 1.00 0.49 0.27
## keystroke90 0.25 0.22 0.49 1.00 0.41
## keystroke100 0.12 0.13 0.27 0.41 1.00
## keystroke110 -0.22 0.09 0.01 0.08 0.39
## keystroke120 -0.16 -0.14 0.09 0.26 0.33
## keystroke130 -0.10 -0.17 -0.04 0.05 0.31
## keystroke140 -0.22 -0.12 -0.21 -0.09 0.17
## keystroke150 -0.22 -0.07 -0.14 -0.01 0.02
## keystroke160 -0.15 -0.17 -0.04 -0.06 0.01
## keystroke170 -0.18 -0.08 -0.12 -0.01 -0.03
## ouliers -0.18 -0.12 -0.08 -0.06 -0.02
## essay_sent -0.04 0.00 -0.06 -0.04 -0.04
## keystroke110 keystroke120 keystroke130 keystroke140 keystroke150
## keystroke10 -0.02 -0.01 -0.07 -0.10 0.04
## keystroke20 -0.13 0.01 -0.08 -0.11 -0.13
## keystroke30 0.02 -0.13 0.04 -0.17 -0.16
## keystroke40 -0.04 0.00 -0.12 -0.08 -0.06
## keystroke50 -0.08 -0.08 -0.04 -0.14 -0.15
## keystroke60 -0.22 -0.16 -0.10 -0.22 -0.22
## keystroke70 0.09 -0.14 -0.17 -0.12 -0.07
## keystroke80 0.01 0.09 -0.04 -0.21 -0.14
## keystroke90 0.08 0.26 0.05 -0.09 -0.01
## keystroke100 0.39 0.33 0.31 0.17 0.02
## keystroke110 1.00 0.35 0.32 0.35 0.23
## keystroke120 0.35 1.00 0.54 0.38 0.43
## keystroke130 0.32 0.54 1.00 0.46 0.48
## keystroke140 0.35 0.38 0.46 1.00 0.58
## keystroke150 0.23 0.43 0.48 0.58 1.00
## keystroke160 0.12 0.41 0.50 0.51 0.60
## keystroke170 0.07 0.30 0.32 0.47 0.61
## ouliers 0.07 0.28 0.35 0.48 0.58
## essay_sent 0.00 -0.04 -0.01 0.00 -0.02
## keystroke160 keystroke170 ouliers essay_sent
## keystroke10 -0.03 0.00 0.00 0.01
## keystroke20 -0.09 -0.14 -0.18 -0.04
## keystroke30 -0.10 -0.18 -0.20 -0.01
## keystroke40 -0.13 -0.11 -0.16 -0.02
## keystroke50 -0.08 -0.12 -0.14 -0.03
## keystroke60 -0.15 -0.18 -0.18 -0.04
## keystroke70 -0.17 -0.08 -0.12 0.00
## keystroke80 -0.04 -0.12 -0.08 -0.06
## keystroke90 -0.06 -0.01 -0.06 -0.04
## keystroke100 0.01 -0.03 -0.02 -0.04
## keystroke110 0.12 0.07 0.07 0.00
## keystroke120 0.41 0.30 0.28 -0.04
## keystroke130 0.50 0.32 0.35 -0.01
## keystroke140 0.51 0.47 0.48 0.00
## keystroke150 0.60 0.61 0.58 -0.02
## keystroke160 1.00 0.61 0.75 -0.02
## keystroke170 0.61 1.00 0.78 0.00
## ouliers 0.75 0.78 1.00 -0.02
## essay_sent -0.02 0.00 -0.02 1.00
#visualized
corrplot(corr_matrix, type = 'upper', order = "hclust",
tl.col = "black", tl.srt = 45)
We will add keystokes over 110 to outliers later because there is a strong correlation ### Plot Data Construction
val.vec <- c()
type.vec <- c()
sent.vec <- c()
theme.vec <- c()
for (i in 1:nrow(data.set)) {
val <- max(data.set[i, 3:19])
type <- data.set[i, 22]
sent <- data.set[i, 21]
theme <- data.set[i, 2]
val.vec <- c(val.vec, val)
type.vec <- c(type.vec, type)
sent.vec <- c(sent.vec, sent)
theme.vec <- c(theme.vec, theme)
}
plot.data <- data.frame(theme.vec, val.vec, type.vec, sent.vec)
colnames(plot.data) <- c('Theme', 'MaxKeystroke', "TypeofEssay", "Sentiment")
#Gay Marriage
ggplot(data=plot.data %>% filter(plot.data$Theme=='GayMerriage'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Histogram of maximum keystroke over the essay on Gay Marriage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=plot.data %>% filter(plot.data$Theme=='GayMerriage'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Dependency of maximum keystroke over the essay on Gay Marriage and its sentiment")
#Gun Control
ggplot(data=plot.data %>% filter(plot.data$Theme=='GunControl'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Histogram of maximum keystroke over the essay on Gun Control")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=plot.data %>% filter(plot.data$Theme=='GunControl'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Dependency of maximum keystroke over the essay on Gun Control and its sentiment")
#ReviewAMT
ggplot(data=plot.data %>% filter(plot.data$Theme=='ReviewAMT'), aes(MaxKeystroke, colour=TypeofEssay)) + geom_histogram() + ggtitle("Dependency of maximum keystroke over the essay on Review and its sentiment")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=plot.data %>% filter(plot.data$Theme=='ReviewAMT'), aes(x = MaxKeystroke, y =Sentiment, colour=TypeofEssay)) + geom_point() + ggtitle("Histogram of maximum keystroke over the essay on Review")
People writing about Gay Marriage or Gun Control try to keep neutural position compared to essays on Review of the AMT where sentiment is either very negative or very positive. Also we can say that huge keystroke almost never correspond to neutural opinion, so people take more time to write positive or negative thoughts.
model.1 <- lm(essay_sent ~ keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 +
keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 +
keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
+ keystroke170 , data=data.set)
summary(model.1)
##
## Call:
## lm(formula = essay_sent ~ keystroke10 + keystroke20 + keystroke30 +
## keystroke30 + keystroke40 + keystroke50 + keystroke60 + keystroke70 +
## keystroke80 + keystroke90 + keystroke100 + keystroke110 +
## keystroke120 + keystroke130 + keystroke140 + keystroke150 +
## keystroke160 + keystroke170, data = data.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.730 -3.383 1.605 2.437 3.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9150298 0.1152094 51.342 <2e-16 ***
## keystroke10 0.0024895 0.0023522 1.058 0.290
## keystroke20 -0.0043525 0.0029372 -1.482 0.138
## keystroke30 0.0006874 0.0042020 0.164 0.870
## keystroke40 -0.0001009 0.0032481 -0.031 0.975
## keystroke50 -0.0021723 0.0040944 -0.531 0.596
## keystroke60 -0.0010607 0.0027784 -0.382 0.703
## keystroke70 0.0013293 0.0026067 0.510 0.610
## keystroke80 -0.0054675 0.0028725 -1.903 0.057 .
## keystroke90 -0.0012953 0.0034029 -0.381 0.703
## keystroke100 -0.0034422 0.0026537 -1.297 0.195
## keystroke110 0.0024434 0.0035609 0.686 0.493
## keystroke120 -0.0072593 0.0049708 -1.460 0.144
## keystroke130 0.0072175 0.0066466 1.086 0.278
## keystroke140 0.0029920 0.0052392 0.571 0.568
## keystroke150 -0.0192753 0.0129616 -1.487 0.137
## keystroke160 -0.0146290 0.0127138 -1.151 0.250
## keystroke170 0.0156815 0.0199026 0.788 0.431
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.682 on 4782 degrees of freedom
## Multiple R-squared: 0.007395, Adjusted R-squared: 0.003866
## F-statistic: 2.096 on 17 and 4782 DF, p-value: 0.005268
ggplot(data=data.set, aes(x = keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 +
keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 +
keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
+ keystroke170, y= essay_sent)) + geom_point() + geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'
model_pred <- predict(model.1)
model.res <- data.set$essay_sent - model_pred
plot(model_pred, model.res)
abline(h=0)
Fighting the problem of too many variables, because they become insignificant. Let`s sum over keystroke
vec <- c()
for (i in 1:nrow(data.set)) {
vec <- c(vec, sum(data.set[i, 3:10]))
}
data.set$sum.10.80 <- vec
ind <- 90
for (i in 11:14) {
ind.str <- paste(toString(ind),"bin",sep="")
treshold <- mean(data.set[, i]) - 0.4*mean(data.set[, i])
data.set[, i] <- ifelse(data.set[, i] >= treshold, 1, 0)
ind <- ind + 10
}
### adding keystroke >= 120 to outliers
for ( i in 1:nrow(data.set)) {
data.set[i, 20] <- data.set[i, 20] + sum(data.set[i, 14:19])
}
model.2 <- lm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
keystroke110 + ouliers, data=data.set)
summary(model.2)
##
## Call:
## lm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
## keystroke110 + ouliers, data = data.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.575 -3.403 1.626 2.445 3.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9614775 0.1308253 45.568 < 2e-16 ***
## sum.10.80 -0.0014267 0.0005039 -2.831 0.00465 **
## keystroke90 -0.2407210 0.0868967 -2.770 0.00562 **
## keystroke100 0.0811285 0.0959841 0.845 0.39803
## keystroke110 -0.0044195 0.0912404 -0.048 0.96137
## ouliers -0.0025744 0.0011270 -2.284 0.02240 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.683 on 4794 degrees of freedom
## Multiple R-squared: 0.00414, Adjusted R-squared: 0.003101
## F-statistic: 3.986 on 5 and 4794 DF, p-value: 0.001309
library(rsq)
library(ggplot2)
poisson.model <- glm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
keystroke110 + ouliers + factor(name), data=data.set, family = poisson(link = "log"))
summary(poisson.model)
##
## Call:
## glm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
## keystroke110 + ouliers + factor(name), family = poisson(link = "log"),
## data = data.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4899 -1.3529 0.2351 0.5320 1.5275
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.514e+00 2.404e-02 62.972 <2e-16 ***
## sum.10.80 1.542e-04 8.233e-05 1.873 0.0611 .
## keystroke90 4.103e-03 1.371e-02 0.299 0.7648
## keystroke100 2.424e-02 1.524e-02 1.591 0.1117
## keystroke110 -1.347e-02 1.449e-02 -0.929 0.3526
## ouliers 2.434e-04 1.826e-04 1.333 0.1826
## factor(name)GunControl -3.798e-02 1.637e-02 -2.320 0.0203 *
## factor(name)ReviewAMT 4.263e-01 1.497e-02 28.465 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7072.0 on 4799 degrees of freedom
## Residual deviance: 5832.6 on 4792 degrees of freedom
## AIC: 22309
##
## Number of Fisher Scoring iterations: 4
rsq(poisson.model,adj=TRUE)
## [1] 0.2046907
plot(predict(poisson.model), data.set$essay_sent, xlab="predicted",ylab="actual")
abline(a=0,b=1)
data.set$essay_sentbin <- ifelse(data.set$essay_sent >= 5, 1, 0)
logistic.model <- glm(essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 +
keystroke110 + ouliers , data=data.set, family = "binomial")
summary(logistic.model)
##
## Call:
## glm(formula = essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 +
## keystroke110 + ouliers, family = "binomial", data = data.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4547 -1.3236 0.9754 1.0280 1.3731
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6611704 0.0992363 6.663 2.69e-11 ***
## sum.10.80 -0.0008856 0.0003794 -2.334 0.0196 *
## keystroke90 -0.1674460 0.0663682 -2.523 0.0116 *
## keystroke100 0.0649890 0.0728704 0.892 0.3725
## keystroke110 -0.0213246 0.0693270 -0.308 0.7584
## ouliers -0.0019778 0.0008508 -2.325 0.0201 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6494.9 on 4799 degrees of freedom
## Residual deviance: 6478.7 on 4794 degrees of freedom
## AIC: 6490.7
##
## Number of Fisher Scoring iterations: 4
plot(model_pred, data.set$essay_sentbin)
model_pred <- predict(logistic.model)
model.res <- data.set$essay_sentbin - model_pred
plot(model_pred, model.res)
abline(h=0)
logistic.model.type <- glm(essay_typebin ~ sum.10.80 + ouliers , data=data.set, family = "binomial")
summary(logistic.model.type)
##
## Call:
## glm(formula = essay_typebin ~ sum.10.80 + ouliers, family = "binomial",
## data = data.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8028 -1.1571 -0.2197 1.1890 1.3287
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4230621 0.0806654 -5.245 1.57e-07 ***
## sum.10.80 0.0021525 0.0003881 5.547 2.91e-08 ***
## ouliers 0.0022115 0.0008285 2.669 0.0076 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6654.2 on 4799 degrees of freedom
## Residual deviance: 6620.4 on 4797 degrees of freedom
## AIC: 6626.4
##
## Number of Fisher Scoring iterations: 4
data.set.scaled <- data.set %>% mutate_at(c("keystroke10", "keystroke20", "keystroke30", "keystroke40", "keystroke50", "keystroke60", "keystroke70", "keystroke80", "keystroke90", "keystroke100", "keystroke110", "keystroke120", "keystroke130", "keystroke140", "keystroke150", "keystroke160", "keystroke170"), ~(scale(.) %>% as.vector))
Doesn`t work for us, because we get negative values for keystrokes
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
summary(data.set)
## id name keystroke10 keystroke20
## Min. : 0 Length:4800 Min. : 0.00 Min. : 0.00
## 1st Qu.:1200 Class :character 1st Qu.: 2.00 1st Qu.: 10.00
## Median :2400 Mode :character Median : 11.00 Median : 17.00
## Mean :2400 Mean : 15.49 Mean : 20.97
## 3rd Qu.:3599 3rd Qu.: 22.00 3rd Qu.: 28.00
## Max. :4799 Max. :233.00 Max. :187.00
## keystroke30 keystroke40 keystroke50 keystroke60
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 8.00 1st Qu.: 7.00 1st Qu.: 14.00
## Median : 13.00 Median : 14.00 Median : 13.00 Median : 23.00
## Mean : 15.21 Mean : 16.82 Mean : 15.54 Mean : 28.07
## 3rd Qu.: 19.25 3rd Qu.: 21.00 3rd Qu.: 20.00 3rd Qu.: 37.00
## Max. :159.00 Max. :299.00 Max. :218.00 Max. :229.00
## keystroke70 keystroke80 keystroke90 keystroke100
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 11.00 1st Qu.: 14.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 19.00 Median : 22.00 Median :1.0000 Median :1.0000
## Mean : 22.34 Mean : 25.64 Mean :0.6723 Mean :0.7192
## 3rd Qu.: 29.00 3rd Qu.: 33.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :219.00 Max. :172.00 Max. :1.0000 Max. :1.0000
## keystroke110 keystroke120 keystroke130 keystroke140
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 2.000 1st Qu.: 2.000
## Median :1.0000 Median :1.0000 Median : 5.000 Median : 5.000
## Mean :0.6523 Mean :0.5723 Mean : 7.471 Mean : 8.732
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 11.000 3rd Qu.:12.000
## Max. :1.0000 Max. :1.0000 Max. :152.000 Max. :97.000
## keystroke150 keystroke160 keystroke170 ouliers
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 11.00
## Median : 2.000 Median : 1.000 Median : 1.000 Median : 24.00
## Mean : 3.268 Mean : 3.002 Mean : 1.625 Mean : 35.95
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.: 47.00
## Max. :43.000 Max. :46.000 Max. :31.000 Max. :368.00
## essay_sent essay_type essay_typebin sum.10.80
## Min. :1.000 Length:4800 Min. :0.0 Min. : 4.0
## 1st Qu.:2.000 Class :character 1st Qu.:0.0 1st Qu.:111.0
## Median :7.000 Mode :character Median :0.5 Median :148.0
## Mean :5.534 Mean :0.5 Mean :160.1
## 3rd Qu.:8.000 3rd Qu.:1.0 3rd Qu.:190.0
## Max. :8.000 Max. :1.0 Max. :935.0
## essay_sentbin
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5908
## 3rd Qu.:1.0000
## Max. :1.0000
data.set$essay_sent3 <- ifelse(data.set$essay_sent >= 6, "positive", ifelse(data.set$essay_sent <4, "negative", "neutural"))
model.polr <- polr(factor(essay_sent3)~essay_typebin + keystroke90 + keystroke100 + keystroke110 + keystroke120 + factor(name), data = data.set, Hess = TRUE)
summary(model.polr)
## Call:
## polr(formula = factor(essay_sent3) ~ essay_typebin + keystroke90 +
## keystroke100 + keystroke110 + keystroke120 + factor(name),
## data = data.set, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## essay_typebin -0.10718 0.06016 -1.7815
## keystroke90 0.05783 0.06959 0.8311
## keystroke100 0.07985 0.07441 1.0731
## keystroke110 -0.02684 0.07235 -0.3710
## keystroke120 -0.05841 0.06891 -0.8476
## factor(name)GunControl -0.09055 0.06548 -1.3828
## factor(name)ReviewAMT 2.29116 0.09020 25.4009
##
## Intercepts:
## Value Std. Error t value
## negative|neutural -0.3488 0.0864 -4.0364
## neutural|positive 0.3790 0.0865 4.3792
##
## Residual Deviance: 8256.865
## AIC: 8274.865
(ctable <- coef(summary(model.polr)))
## Value Std. Error t value
## essay_typebin -0.10718102 0.06016445 -1.7814676
## keystroke90 0.05783411 0.06958906 0.8310804
## keystroke100 0.07984790 0.07440792 1.0731102
## keystroke110 -0.02684282 0.07234750 -0.3710263
## keystroke120 -0.05840766 0.06891146 -0.8475754
## factor(name)GunControl -0.09054923 0.06548123 -1.3828272
## factor(name)ReviewAMT 2.29116469 0.09020027 25.4008641
## negative|neutural -0.34878002 0.08640819 -4.0364231
## neutural|positive 0.37895330 0.08653444 4.3792195
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
## Value Std. Error t value p value
## essay_typebin -0.10718102 0.06016445 -1.7814676 7.483609e-02
## keystroke90 0.05783411 0.06958906 0.8310804 4.059282e-01
## keystroke100 0.07984790 0.07440792 1.0731102 2.832217e-01
## keystroke110 -0.02684282 0.07234750 -0.3710263 7.106180e-01
## keystroke120 -0.05840766 0.06891146 -0.8475754 3.966745e-01
## factor(name)GunControl -0.09054923 0.06548123 -1.3828272 1.667179e-01
## factor(name)ReviewAMT 2.29116469 0.09020027 25.4008641 2.467032e-142
## negative|neutural -0.34878002 0.08640819 -4.0364231 5.427231e-05
## neutural|positive 0.37895330 0.08653444 4.3792195 1.191051e-05
Here i deleted every observation where number of keystrokes was less than on 0.95 conf interval
library(dplyr)
data.set.filtered <- data.set %>% filter(data.set$keystroke10 <= quantile(data.set$keystroke10, c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke20 <= quantile(data.set.filtered$keystroke20,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke30 <= quantile(data.set.filtered$keystroke30,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke40 <= quantile(data.set.filtered$keystroke40,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke50 <= quantile(data.set.filtered$keystroke50,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke60 <= quantile(data.set.filtered$keystroke60,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke70 <= quantile(data.set.filtered$keystroke70,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke80 <= quantile(data.set.filtered$keystroke80,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke90 <= quantile(data.set.filtered$keystroke90,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke100 <= quantile(data.set.filtered$keystroke100,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke110 <= quantile(data.set.filtered$keystroke110,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke120 <= quantile(data.set.filtered$keystroke120,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke130 <= quantile(data.set.filtered$keystroke130,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke140 <= quantile(data.set.filtered$keystroke140,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke150 <= quantile(data.set.filtered$keystroke150,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke160 <= quantile(data.set.filtered$keystroke160,c(.95)))
data.set.filtered <- data.set.filtered %>% filter(data.set.filtered$keystroke170 <= quantile(data.set.filtered$keystroke170,c(.95)))
p.10 <- ggplot(data=data.set.filtered, aes(keystroke10, colour=name)) +
geom_histogram(fill="white")
p.10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.20 <- ggplot(data=data.set.filtered, aes(keystroke20, colour=name)) +
geom_histogram(fill="white")
p.20
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.30 <- ggplot(data=data.set.filtered, aes(keystroke30, colour=name)) +
geom_histogram(fill="white")
p.30
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.40 <- ggplot(data=data.set.filtered, aes(keystroke40, colour=name)) +
geom_histogram(fill="white")
p.40
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.50 <- ggplot(data=data.set.filtered, aes(keystroke50, colour=name)) +
geom_histogram(fill="white")
p.50
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.60 <- ggplot(data=data.set.filtered, aes(keystroke60, colour=name)) +
geom_histogram(fill="white")
p.60
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.70 <- ggplot(data=data.set.filtered, aes(keystroke70, colour=name)) +
geom_histogram(fill="white")
p.70
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.80 <- ggplot(data=data.set.filtered, aes(keystroke80, colour=name)) +
geom_histogram(fill="white")
p.80
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p.90 <- ggplot(data=data.set.filtered, aes(keystroke90, colour=name)) +
geom_histogram(fill="white")
p.90
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
model.1 <- lm(essay_sent ~ keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 +
keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 +
keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
+ keystroke170 , data=data.set.filtered)
summary(model.1)
##
## Call:
## lm(formula = essay_sent ~ keystroke10 + keystroke20 + keystroke30 +
## keystroke30 + keystroke40 + keystroke50 + keystroke60 + keystroke70 +
## keystroke80 + keystroke90 + keystroke100 + keystroke110 +
## keystroke120 + keystroke130 + keystroke140 + keystroke150 +
## keystroke160 + keystroke170, data = data.set.filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.922 -3.325 1.723 2.367 3.224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.292198 0.277062 22.710 <2e-16 ***
## keystroke10 -0.002537 0.005667 -0.448 0.6544
## keystroke20 0.008063 0.006945 1.161 0.2458
## keystroke30 0.005518 0.010600 0.521 0.6027
## keystroke40 -0.014789 0.009444 -1.566 0.1175
## keystroke50 -0.005756 0.011659 -0.494 0.6216
## keystroke60 -0.007546 0.006422 -1.175 0.2401
## keystroke70 0.006080 0.007171 0.848 0.3966
## keystroke80 -0.012943 0.006103 -2.121 0.0340 *
## keystroke90 -0.077262 0.150457 -0.514 0.6076
## keystroke100 0.058055 0.150124 0.387 0.6990
## keystroke110 -0.115113 0.141491 -0.814 0.4160
## keystroke120 -0.184520 0.137898 -1.338 0.1810
## keystroke130 -0.008764 0.013688 -0.640 0.5220
## keystroke140 0.025326 0.013683 1.851 0.0643 .
## keystroke150 -0.012435 0.029461 -0.422 0.6730
## keystroke160 -0.012792 0.036552 -0.350 0.7264
## keystroke170 -0.064968 0.055886 -1.163 0.2451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.68 on 2575 degrees of freedom
## Multiple R-squared: 0.01174, Adjusted R-squared: 0.005215
## F-statistic: 1.799 on 17 and 2575 DF, p-value: 0.02299
ggplot(data=data.set.filtered, aes(x = keystroke10 + keystroke20 + keystroke30 + keystroke30 + keystroke40 +
keystroke50 + keystroke60 + keystroke70 + keystroke80 + keystroke90 + keystroke100 +
keystroke110 + keystroke120 + keystroke130 + keystroke140 + keystroke150 + keystroke160
+ keystroke170, y= essay_sent)) + geom_point() + geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'
model_pred <- predict(model.1)
model.res <- data.set.filtered$essay_sent - model_pred
plot(model_pred, model.res)
abline(h=0)
Fighting the problem of too many variables, because they become insignificant. Let`s sum over keystroke
vec <- c()
for (i in 1:nrow(data.set.filtered)) {
vec <- c(vec, sum(data.set.filtered[i, 3:10]))
}
data.set.filtered$sum.10.80 <- vec
ind <- 90
for (i in 11:14) {
ind.str <- paste(toString(ind),"bin",sep="")
treshold <- mean(data.set.filtered[, i]) - 0.4*mean(data.set.filtered[, i])
data.set.filtered[, i] <- ifelse(data.set.filtered[, i] >= treshold, 1, 0)
ind <- ind + 10
}
### adding keystroke >= 120 to outliers
for ( i in 1:nrow(data.set.filtered)) {
data.set.filtered[i, 20] <- data.set.filtered[i, 20] + sum(data.set.filtered[i, 14:19])
}
library(rsq)
library(ggplot2)
poisson.model <- glm(essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
keystroke110 + ouliers + factor(name), data=data.set.filtered, family = poisson(link = "log"))
summary(poisson.model)
##
## Call:
## glm(formula = essay_sent ~ sum.10.80 + keystroke90 + keystroke100 +
## keystroke110 + ouliers + factor(name), family = poisson(link = "log"),
## data = data.set.filtered)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3980 -1.3355 0.2531 0.4077 1.5595
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.5799853 0.0477884 33.062 < 2e-16 ***
## sum.10.80 0.0001009 0.0002306 0.438 0.66161
## keystroke90 -0.0173143 0.0209522 -0.826 0.40859
## keystroke100 0.0300668 0.0225342 1.334 0.18211
## keystroke110 -0.0252806 0.0203543 -1.242 0.21423
## ouliers -0.0001464 0.0003287 -0.445 0.65606
## factor(name)GunControl -0.0656833 0.0228398 -2.876 0.00403 **
## factor(name)ReviewAMT 0.3936910 0.0197869 19.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3796.5 on 2592 degrees of freedom
## Residual deviance: 3142.9 on 2585 degrees of freedom
## AIC: 12098
##
## Number of Fisher Scoring iterations: 4
rsq(poisson.model,adj=TRUE)
## [1] 0.1996001
plot(predict(poisson.model), data.set.filtered$essay_sent, xlab="predicted",ylab="actual")
abline(a=0,b=1)
data.set.filtered$essay_sentbin <- ifelse(data.set.filtered$essay_sent >= 5, 1, 0)
logistic.model <- glm(essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 +
keystroke110 + ouliers , data=data.set.filtered, family = "binomial")
summary(logistic.model)
##
## Call:
## glm(formula = essay_sentbin ~ sum.10.80 + keystroke90 + keystroke100 +
## keystroke110 + ouliers, family = "binomial", data = data.set.filtered)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5746 -1.3431 0.9539 1.0117 1.1442
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9497085 0.2156885 4.403 1.07e-05 ***
## sum.10.80 -0.0019892 0.0010958 -1.815 0.0695 .
## keystroke90 -0.1772353 0.1048172 -1.691 0.0909 .
## keystroke100 0.0128717 0.1098422 0.117 0.9067
## keystroke110 -0.1156146 0.0993287 -1.164 0.2444
## ouliers -0.0008932 0.0015828 -0.564 0.5726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3477.5 on 2592 degrees of freedom
## Residual deviance: 3469.1 on 2587 degrees of freedom
## AIC: 3481.1
##
## Number of Fisher Scoring iterations: 4
logistic.model.type <- glm(essay_typebin ~ sum.10.80 + ouliers , data=data.set.filtered, family = "binomial")
summary(logistic.model.type)
##
## Call:
## glm(formula = essay_typebin ~ sum.10.80 + ouliers, family = "binomial",
## data = data.set.filtered)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.277 -1.144 -1.098 1.209 1.318
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.396022 0.182820 -2.166 0.0303 *
## sum.10.80 0.001930 0.001046 1.845 0.0650 .
## ouliers 0.001343 0.001469 0.914 0.3607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3591.0 on 2592 degrees of freedom
## Residual deviance: 3587.6 on 2590 degrees of freedom
## AIC: 3593.6
##
## Number of Fisher Scoring iterations: 3
rsq(logistic.model.type)
## [1] 0.00130308
library(MASS)
summary(data.set)
## id name keystroke10 keystroke20
## Min. : 0 Length:4800 Min. : 0.00 Min. : 0.00
## 1st Qu.:1200 Class :character 1st Qu.: 2.00 1st Qu.: 10.00
## Median :2400 Mode :character Median : 11.00 Median : 17.00
## Mean :2400 Mean : 15.49 Mean : 20.97
## 3rd Qu.:3599 3rd Qu.: 22.00 3rd Qu.: 28.00
## Max. :4799 Max. :233.00 Max. :187.00
## keystroke30 keystroke40 keystroke50 keystroke60
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 8.00 1st Qu.: 7.00 1st Qu.: 14.00
## Median : 13.00 Median : 14.00 Median : 13.00 Median : 23.00
## Mean : 15.21 Mean : 16.82 Mean : 15.54 Mean : 28.07
## 3rd Qu.: 19.25 3rd Qu.: 21.00 3rd Qu.: 20.00 3rd Qu.: 37.00
## Max. :159.00 Max. :299.00 Max. :218.00 Max. :229.00
## keystroke70 keystroke80 keystroke90 keystroke100
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 11.00 1st Qu.: 14.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 19.00 Median : 22.00 Median :1.0000 Median :1.0000
## Mean : 22.34 Mean : 25.64 Mean :0.6723 Mean :0.7192
## 3rd Qu.: 29.00 3rd Qu.: 33.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :219.00 Max. :172.00 Max. :1.0000 Max. :1.0000
## keystroke110 keystroke120 keystroke130 keystroke140
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 2.000 1st Qu.: 2.000
## Median :1.0000 Median :1.0000 Median : 5.000 Median : 5.000
## Mean :0.6523 Mean :0.5723 Mean : 7.471 Mean : 8.732
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 11.000 3rd Qu.:12.000
## Max. :1.0000 Max. :1.0000 Max. :152.000 Max. :97.000
## keystroke150 keystroke160 keystroke170 ouliers
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 11.00
## Median : 2.000 Median : 1.000 Median : 1.000 Median : 24.00
## Mean : 3.268 Mean : 3.002 Mean : 1.625 Mean : 35.95
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.: 47.00
## Max. :43.000 Max. :46.000 Max. :31.000 Max. :368.00
## essay_sent essay_type essay_typebin sum.10.80
## Min. :1.000 Length:4800 Min. :0.0 Min. : 4.0
## 1st Qu.:2.000 Class :character 1st Qu.:0.0 1st Qu.:111.0
## Median :7.000 Mode :character Median :0.5 Median :148.0
## Mean :5.534 Mean :0.5 Mean :160.1
## 3rd Qu.:8.000 3rd Qu.:1.0 3rd Qu.:190.0
## Max. :8.000 Max. :1.0 Max. :935.0
## essay_sentbin essay_sent3
## Min. :0.0000 Length:4800
## 1st Qu.:0.0000 Class :character
## Median :1.0000 Mode :character
## Mean :0.5908
## 3rd Qu.:1.0000
## Max. :1.0000
data.set.filtered$essay_sent3 <- ifelse(data.set.filtered$essay_sent >= 6, "positive", ifelse(data.set.filtered$essay_sent <4, "negative", "neutural"))
model.polr <- polr(factor(essay_sent3)~essay_typebin + keystroke90 + keystroke100 + keystroke110 + keystroke120 + factor(name), data = data.set.filtered, Hess = TRUE)
summary(model.polr)
## Call:
## polr(formula = factor(essay_sent3) ~ essay_typebin + keystroke90 +
## keystroke100 + keystroke110 + keystroke120 + factor(name),
## data = data.set.filtered, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## essay_typebin -0.05978 0.08277 -0.7223
## keystroke90 -0.05280 0.11080 -0.4765
## keystroke100 0.12824 0.11265 1.1384
## keystroke110 -0.05517 0.10253 -0.5380
## keystroke120 -0.14468 0.09808 -1.4752
## factor(name)GunControl -0.16176 0.09179 -1.7623
## factor(name)ReviewAMT 2.10370 0.11573 18.1782
##
## Intercepts:
## Value Std. Error t value
## negative|neutural -0.4673 0.1356 -3.4459
## neutural|positive 0.1961 0.1354 1.4486
##
## Residual Deviance: 4361.228
## AIC: 4379.228
(ctable <- coef(summary(model.polr)))
## Value Std. Error t value
## essay_typebin -0.05978167 0.08276899 -0.7222713
## keystroke90 -0.05279813 0.11080418 -0.4764994
## keystroke100 0.12824405 0.11265257 1.1384032
## keystroke110 -0.05516576 0.10252940 -0.5380482
## keystroke120 -0.14468449 0.09807725 -1.4752095
## factor(name)GunControl -0.16175591 0.09178606 -1.7623145
## factor(name)ReviewAMT 2.10369794 0.11572673 18.1781509
## negative|neutural -0.46726885 0.13560294 -3.4458608
## neutural|positive 0.19607668 0.13535562 1.4486039
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
## Value Std. Error t value p value
## essay_typebin -0.05978167 0.08276899 -0.7222713 4.701277e-01
## keystroke90 -0.05279813 0.11080418 -0.4764994 6.337186e-01
## keystroke100 0.12824405 0.11265257 1.1384032 2.549522e-01
## keystroke110 -0.05516576 0.10252940 -0.5380482 5.905438e-01
## keystroke120 -0.14468449 0.09807725 -1.4752095 1.401562e-01
## factor(name)GunControl -0.16175591 0.09178606 -1.7623145 7.801617e-02
## factor(name)ReviewAMT 2.10369794 0.11572673 18.1781509 7.688278e-74
## negative|neutural -0.46726885 0.13560294 -3.4458608 5.692435e-04
## neutural|positive 0.19607668 0.13535562 1.4486039 1.474482e-01
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
randForest.selection <- randomForest(factor(data.set.filtered$essay_sent3)~., data= data.set.filtered[, c(3:20)])
importance(randForest.selection)
## MeanDecreaseGini
## keystroke10 114.28441
## keystroke20 121.02728
## keystroke30 107.11058
## keystroke40 111.68764
## keystroke50 109.70164
## keystroke60 125.81134
## keystroke70 120.45692
## keystroke80 121.69747
## keystroke90 13.19775
## keystroke100 16.15979
## keystroke110 17.85811
## keystroke120 16.02276
## keystroke130 91.52549
## keystroke140 91.89723
## keystroke150 64.13282
## keystroke160 59.86850
## keystroke170 44.12682
## ouliers 125.33648